# Load packages
library(here)
library(tidyverse)
library(gt)
library(e1071)
library(scales)
library(corrplot)
library(caret)
library(randomForest)
library(glmnet)
library(gbm)
# Read in data. here() joins path components itself, so pass them
# separately instead of embedding a (typo'd) "inputs//" double slash.
data <- read_csv(here("inputs", "data_prep.csv"))
# Convert the two categorical columns to factors in one across() call
data <- data %>%
  mutate(across(c(room_type, license), as.factor))

Introduction

For this analysis we’ll explore the listings data of Airbnb rentals in Toronto. The data can be found at link

We’re primarily interested in the rental price however we’ll explore the entire dataset for anything interesting and visualize the results. The analysis will conclude with a model for predicting rental prices.

We’ve previously removed all listings with no reviews; we also removed all listings with no availability within the next year, as these are likely no longer actively being rented, and removed NA columns from the data.

Data Exploration

We’ll generate some initial summary statistics of the various predictors to get started:

# Check summary statistics for every column: a first look at ranges,
# missingness (NA counts) and obvious data-quality issues before modelling
summary(data)
##        id            listing_url          scrape_id        
##  Min.   :8.077e+03   Length:9358        Min.   :2.023e+13  
##  1st Qu.:2.568e+07   Class :character   1st Qu.:2.023e+13  
##  Median :4.633e+07   Mode  :character   Median :2.023e+13  
##  Mean   :2.448e+17                      Mean   :2.023e+13  
##  3rd Qu.:6.565e+17                      3rd Qu.:2.023e+13  
##  Max.   :8.411e+17                      Max.   :2.023e+13  
##                                                            
##   last_scraped           source              name           description       
##  Min.   :2023-03-09   Length:9358        Length:9358        Length:9358       
##  1st Qu.:2023-03-10   Class :character   Class :character   Class :character  
##  Median :2023-03-10   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2023-03-09                                                           
##  3rd Qu.:2023-03-10                                                           
##  Max.   :2023-03-26                                                           
##                                                                               
##  neighborhood_overview picture_url           host_id         
##  Length:9358           Length:9358        Min.   :    22795  
##  Class :character      Class :character   1st Qu.: 28909542  
##  Mode  :character      Mode  :character   Median :113615879  
##                                           Mean   :169614255  
##                                           3rd Qu.:283517431  
##                                           Max.   :506119636  
##                                                              
##    host_url          host_name           host_since         host_location     
##  Length:9358        Length:9358        Min.   :2009-06-22   Length:9358       
##  Class :character   Class :character   1st Qu.:2015-03-16   Class :character  
##  Mode  :character   Mode  :character   Median :2017-01-28   Mode  :character  
##                                        Mean   :2017-05-09                     
##                                        3rd Qu.:2019-08-06                     
##                                        Max.   :2023-03-19                     
##                                                                               
##   host_about        host_response_time host_response_rate host_acceptance_rate
##  Length:9358        Length:9358        Length:9358        Length:9358         
##  Class :character   Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character    
##                                                                               
##                                                                               
##                                                                               
##                                                                               
##  host_is_superhost host_thumbnail_url host_picture_url   host_neighbourhood
##  Mode :logical     Length:9358        Length:9358        Length:9358       
##  FALSE:5715        Class :character   Class :character   Class :character  
##  TRUE :3643        Mode  :character   Mode  :character   Mode  :character  
##                                                                            
##                                                                            
##                                                                            
##                                                                            
##  host_listings_count host_total_listings_count host_verifications
##  Min.   :  1.000     Min.   :  1.00            Length:9358       
##  1st Qu.:  1.000     1st Qu.:  1.00            Class :character  
##  Median :  2.000     Median :  3.00            Mode  :character  
##  Mean   :  7.539     Mean   : 12.51                              
##  3rd Qu.:  5.000     3rd Qu.:  8.00                              
##  Max.   :513.000     Max.   :585.00                              
##                                                                  
##  host_has_profile_pic host_identity_verified neighbourhood     
##  Mode :logical        Mode :logical          Length:9358       
##  FALSE:139            FALSE:607              Class :character  
##  TRUE :9219           TRUE :8751             Mode  :character  
##                                                                
##                                                                
##                                                                
##                                                                
##  neighbourhood_cleansed neighbourhood_group_cleansed    latitude    
##  Length:9358            Mode:logical                 Min.   :43.59  
##  Class :character       NA's:9358                    1st Qu.:43.65  
##  Mode  :character                                    Median :43.67  
##                                                      Mean   :43.68  
##                                                      3rd Qu.:43.71  
##                                                      Max.   :43.84  
##                                                                     
##    longitude      property_type                room_type     accommodates   
##  Min.   :-79.62   Length:9358        Entire home/apt:6176   Min.   : 1.000  
##  1st Qu.:-79.43   Class :character   Hotel room     :   1   1st Qu.: 2.000  
##  Median :-79.40   Mode  :character   Private room   :3109   Median : 2.000  
##  Mean   :-79.40                      Shared room    :  72   Mean   : 3.201  
##  3rd Qu.:-79.37                                             3rd Qu.: 4.000  
##  Max.   :-79.13                                             Max.   :16.000  
##                                                                             
##  bathrooms      bathrooms_text        bedrooms          beds       
##  Mode:logical   Length:9358        Min.   :1.000   Min.   : 1.000  
##  NA's:9358      Class :character   1st Qu.:1.000   1st Qu.: 1.000  
##                 Mode  :character   Median :1.000   Median : 1.000  
##                                    Mean   :1.487   Mean   : 1.783  
##                                    3rd Qu.:2.000   3rd Qu.: 2.000  
##                                    Max.   :9.000   Max.   :11.000  
##                                                    NA's   :88      
##   amenities             price         minimum_nights    maximum_nights   
##  Length:9358        Min.   :   14.0   Min.   :   1.00   Min.   :    1.0  
##  Class :character   1st Qu.:   79.0   1st Qu.:   3.00   1st Qu.:   90.0  
##  Mode  :character   Median :  125.0   Median :  28.00   Median :  365.0  
##                     Mean   :  184.3   Mean   :  24.02   Mean   :  557.2  
##                     3rd Qu.:  206.0   3rd Qu.:  28.00   3rd Qu.: 1125.0  
##                     Max.   :51561.0   Max.   :1125.00   Max.   :10001.0  
##                                                                          
##  minimum_minimum_nights maximum_minimum_nights minimum_maximum_nights
##  Min.   :   1.00        Min.   :   1.00        Min.   :1.000e+00     
##  1st Qu.:   3.00        1st Qu.:   3.00        1st Qu.:3.650e+02     
##  Median :  28.00        Median :  28.00        Median :1.125e+03     
##  Mean   :  23.73        Mean   :  25.25        Mean   :6.892e+05     
##  3rd Qu.:  28.00        3rd Qu.:  28.00        3rd Qu.:1.125e+03     
##  Max.   :1125.00        Max.   :1125.00        Max.   :2.147e+09     
##                                                                      
##  maximum_maximum_nights minimum_nights_avg_ntm maximum_nights_avg_ntm
##  Min.   :1.000e+00      Min.   :   1.00        Min.   :1.000e+00     
##  1st Qu.:3.650e+02      1st Qu.:   3.00        1st Qu.:3.650e+02     
##  Median :1.125e+03      Median :  28.00        Median :1.125e+03     
##  Mean   :6.892e+05      Mean   :  24.41        Mean   :6.892e+05     
##  3rd Qu.:1.125e+03      3rd Qu.:  28.00        3rd Qu.:1.125e+03     
##  Max.   :2.147e+09      Max.   :1125.00        Max.   :2.147e+09     
##                                                                      
##  calendar_updated has_availability availability_30 availability_60
##  Mode:logical     Mode :logical    Min.   : 0.00   Min.   : 0.00  
##  NA's:9358        FALSE:2          1st Qu.: 1.00   1st Qu.: 8.00  
##                   TRUE :9356       Median :10.00   Median :32.00  
##                                    Mean   :13.32   Mean   :30.85  
##                                    3rd Qu.:27.00   3rd Qu.:54.00  
##                                    Max.   :30.00   Max.   :60.00  
##                                                                   
##  availability_90 availability_365 calendar_last_scraped number_of_reviews
##  Min.   : 0.00   Min.   :  1.0    Min.   :2023-03-09    Min.   :  1.00   
##  1st Qu.:22.00   1st Qu.: 78.0    1st Qu.:2023-03-10    1st Qu.:  3.00   
##  Median :52.00   Median :172.0    Median :2023-03-10    Median : 12.00   
##  Mean   :49.89   Mean   :183.9    Mean   :2023-03-09    Mean   : 36.03   
##  3rd Qu.:83.00   3rd Qu.:304.0    3rd Qu.:2023-03-10    3rd Qu.: 41.00   
##  Max.   :90.00   Max.   :365.0    Max.   :2023-03-26    Max.   :798.00   
##                                                                          
##  number_of_reviews_ltm number_of_reviews_l30d  first_review       
##  Min.   :  0.00        Min.   : 0.0000        Min.   :2009-08-20  
##  1st Qu.:  1.00        1st Qu.: 0.0000        1st Qu.:2018-09-10  
##  Median :  4.00        Median : 0.0000        Median :2021-07-22  
##  Mean   : 11.14        Mean   : 0.7477        Mean   :2020-06-28  
##  3rd Qu.: 14.00        3rd Qu.: 1.0000        3rd Qu.:2022-09-01  
##  Max.   :155.00        Max.   :17.0000        Max.   :2023-03-09  
##                                                                   
##   last_review         review_scores_rating review_scores_accuracy
##  Min.   :2010-08-11   Min.   :0.000        Min.   :1.000         
##  1st Qu.:2022-08-28   1st Qu.:4.700        1st Qu.:4.760         
##  Median :2022-12-29   Median :4.880        Median :4.920         
##  Mean   :2022-07-02   Mean   :4.747        Mean   :4.791         
##  3rd Qu.:2023-02-19   3rd Qu.:5.000        3rd Qu.:5.000         
##  Max.   :2023-03-21   Max.   :5.000        Max.   :5.000         
##                                            NA's   :26            
##  review_scores_cleanliness review_scores_checkin review_scores_communication
##  Min.   :1.000             Min.   :1.000         Min.   :1.000              
##  1st Qu.:4.670             1st Qu.:4.830         1st Qu.:4.860              
##  Median :4.870             Median :4.960         Median :4.980              
##  Mean   :4.728             Mean   :4.839         Mean   :4.846              
##  3rd Qu.:5.000             3rd Qu.:5.000         3rd Qu.:5.000              
##  Max.   :5.000             Max.   :5.000         Max.   :5.000              
##  NA's   :26                NA's   :26            NA's   :26                 
##  review_scores_location review_scores_value                   license    
##  Min.   :1.00           Min.   :1.000       Exempt                : 182  
##  1st Qu.:4.77           1st Qu.:4.620       Approved by government:  32  
##  Median :4.93           Median :4.800       STR-2104-FGWRVB       :  28  
##  Mean   :4.82           Mean   :4.682       STR-2009-HZPDPM       :  10  
##  3rd Qu.:5.00           3rd Qu.:4.950       STR-2010-GQBDPG       :  10  
##  Max.   :5.00           Max.   :5.000       (Other)               :5319  
##  NA's   :26             NA's   :26          NA's                  :3777  
##  instant_bookable calculated_host_listings_count
##  Mode :logical    Min.   :  1.000               
##  FALSE:7568       1st Qu.:  1.000               
##  TRUE :1790       Median :  2.000               
##                   Mean   :  5.094               
##                   3rd Qu.:  4.000               
##                   Max.   :141.000               
##                                                 
##  calculated_host_listings_count_entire_homes
##  Min.   :  0.000                            
##  1st Qu.:  0.000                            
##  Median :  1.000                            
##  Mean   :  3.407                            
##  3rd Qu.:  2.000                            
##  Max.   :141.000                            
##                                             
##  calculated_host_listings_count_private_rooms
##  Min.   : 0.000                              
##  1st Qu.: 0.000                              
##  Median : 0.000                              
##  Mean   : 1.634                              
##  3rd Qu.: 2.000                              
##  Max.   :29.000                              
##                                              
##  calculated_host_listings_count_shared_rooms reviews_per_month
##  Min.   : 0.00000                            Min.   : 0.010   
##  1st Qu.: 0.00000                            1st Qu.: 0.300   
##  Median : 0.00000                            Median : 0.770   
##  Mean   : 0.05076                            Mean   : 1.412   
##  3rd Qu.: 0.00000                            3rd Qu.: 2.000   
##  Max.   :14.00000                            Max.   :13.200   
## 

Some initial things to note from the summary statistics: the vast majority of listings (66.0%, 6176 of 9358) are entire homes or apartments as opposed to shared living spaces. Private rooms make up the bulk of the remainder at 33.2%.

# Listing counts by room type, ordered by frequency.
# count() replaces the redundant select/group_by/summarize/ungroup chain,
# and geom_col() is the idiomatic form of geom_bar(stat = "identity").
data %>%
  count(room_type, name = "listings") %>%
  mutate(room_type = fct_reorder(room_type, listings)) %>%
  ggplot(aes(x = room_type, y = listings, fill = room_type)) +
  geom_col() +
  theme(legend.position = "none") +
  xlab("Room Type") +
  ylab("Listings")

Like any other residential property, the neighbourhood is a likely predictor of the price of the property being rented.

Let’s now look at the distribution of rental prices:

# Histogram of nightly prices across all listings ($10 bins)
data %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  labs(
    title = "Distribution of Airbnb Rental Prices",
    x = "Price",
    y = "Count"
  )

I’ve identified the major outlier in this case to be this listing. To get a better idea, we can check, for example, how many listings there are above a price of $5000:

# How many listings are priced above $5000?
# nrow() states the intent directly; dim(...)[1] is the same value obscured.
data %>%
  filter(price > 5000) %>%
  nrow()
## [1] 3

We find there are only 3 listings. We can then look at a boxplot of the remaining listings after removing the ones above 5000.

# Horizontal boxplot of prices after dropping the extreme (>= $5000) listings.
# The y axis carries no information for a one-variable boxplot, so every
# y-axis element and gridline is stripped from the theme.
data %>%
  filter(price < 5000) %>%
  ggplot(aes(x = price)) +
  geom_boxplot(color = "blue", outlier.color = "red", outlier.size = 2) +
  scale_x_continuous(breaks = c(0, 1000, 2000, 3000, 4000, 5000)) +
  ylim(-4, 4) +
  stat_boxplot(geom = "errorbar") +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank()
  )

The data seems to get much more spread out above a price of $1500, so we’ll focus in on those data points:

# Same boxplot, zoomed to the bulk of the data (prices under $1500),
# with axis ticks every $250 and the meaningless y axis suppressed.
data %>%
  filter(price < 1500) %>%
  ggplot(aes(x = price)) +
  geom_boxplot(color = "blue", outlier.color = "red", outlier.size = 2) +
  scale_x_continuous(breaks = seq(from = 0, to = 1500, by = 250)) +
  ylim(-4, 4) +
  stat_boxplot(geom = "errorbar") +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank()
  )

This seems like a better representation of most of the price data for the listings.

As with any property value we may expect things like the number of bedrooms in the listing to be a predictor of price. Larger properties should have higher bedroom counts and thus be more expensive to rent:

# Mean nightly price for each bedroom count (prices capped below $1500).
# Grouping on as.factor(bedrooms) directly replaces the separate
# mutate + select steps of the original pipeline.
data %>%
  filter(price < 1500) %>%
  group_by(bedrooms = as.factor(bedrooms)) %>%
  summarize(mean_price = mean(price)) %>%
  ungroup() %>%
  ggplot(aes(x = bedrooms, y = mean_price, fill = bedrooms)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(breaks = seq(from = 0, to = 1500, by = 250)) +
  theme(legend.position = "none") +
  xlab("# of Bedrooms") +
  ylab("Mean Price")

There appears to be only 1 listing with 9 bedrooms.

# Inspect the single 9-bedroom listing priced under $1500
data %>%
  filter(price < 1500, bedrooms == 9)

Because the price is so wildly off from the overall trend I think it’s best to remove this point in the model.

Next it seems likely review scores should be a strong predictor of prices:

# Price against review score for listings with more than 5 reviews,
# with a loess smoother to surface any non-linear trend
data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5) %>%
  ggplot(aes(x = review_scores_rating, y = price)) +
  geom_point(alpha = 0.5, color = "#43a2ca") +
  geom_smooth(method = "loess", se = FALSE, color = "red") +
  labs(
    title = "Price vs. Review Scores Rating",
    x = "Review Scores",
    y = "Price"
  )

We can similarly do some quick visualizations to see the relationships among other predictors and the price variable:

# Histogram of minimum-nights requirements for the filtered listings.
# Fixes the user-facing typo in the plot title ("Minimun" -> "Minimum").
data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5) %>%
  ggplot(aes(x = minimum_nights)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  labs(title = "Distribution of Minimum Nights",
       x = "Nights",
       y = "Count")

Clearly we have a pretty wide tail to the right on this distribution, suggesting that there are again a few extreme outliers in this data.

# Tabulate minimum-nights settings by frequency; count(sort = TRUE) is the
# one-step equivalent of select/group_by/summarize/arrange(desc(n))
data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5) %>%
  count(minimum_nights, sort = TRUE)
# Inspect the extreme minimum-nights (> 1000) listings individually
data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5 & minimum_nights > 1000)

Interestingly, by far the most common minimum-nights setting on Airbnb is a full-month stay, followed by 1, 2 and 3 nights. The outliers we’re seeing in minimum nights largely seem to be inactive listings, with last reviews here, for example, being from 2016. We would like to remove listings like this, but instead of making a cutoff for minimum nights we will do this by last review date. This ensures that the pricing we’re seeing in the data reflects currently active listings.

# Distribution of the most recent review date per listing — a proxy for
# whether the listing is still actively rented
data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5) %>%
  ggplot(aes(x = last_review)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  labs(
    title = "Distribution of Latest Review Date",
    x = "Last Review Date",
    y = "Count"
  )

As we can see the vast majority of these listings are recent, however we are getting a stretch of listings dating back all the way to 2015. For the sake of ensuring the pricing is accurate especially considering the price disturbances caused during Covid, we’ll remove listings who have not received a review after 2020.

# Keep only listings reviewed in 2020 or later (comma-separated filter
# conditions are combined with AND, same as the chained & form)
data %>%
  filter(price < 1500,
         bedrooms < 9,
         number_of_reviews > 5,
         last_review >= "2020-01-01")

Next we’ll check the relationship between the number of reviews for a listing and its price:

# Scatter of review count against price for the filtered listings
data %>%
  filter(price < 1500,
         bedrooms < 9,
         number_of_reviews > 5,
         last_review >= "2020-01-01") %>%
  ggplot(aes(x = number_of_reviews, y = price)) +
  geom_point(color = "blue") +
  labs(
    title = "Number of Reviews vs Price",
    x = "Number of Reviews",
    y = "Price"
  )

There doesn’t appear to be any correlation here, but the listings with very high review counts may make this data difficult to see; we can visualize this again with a limit on the number of reviews:

# Same scatter, excluding the very-high-review listings (>= 180 reviews)
# so the bulk of the data is easier to read
data %>%
  filter(price < 1500,
         bedrooms < 9,
         number_of_reviews > 5,
         last_review >= "2020-01-01",
         number_of_reviews < 180) %>%
  ggplot(aes(x = number_of_reviews, y = price)) +
  geom_point(color = "blue") +
  labs(
    title = "Number of Reviews vs Price",
    x = "Number of Reviews",
    y = "Price"
  )

No obvious correlation seems to be present in the relationship between the number of reviews and the price of the listing.

We should also check for linear correlations between predictors, we can visualize this via a correlation matrix

# Spearman correlation matrix for the candidate numeric predictors,
# rendered as a colored grid with coefficients overlaid
df <- data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5 & last_review >= '2020-01-01') %>%
  select(price, minimum_nights, number_of_reviews, review_scores_rating,
         calculated_host_listings_count)

cor_mat <- cor(as.matrix(df), method = "spearman")
corrplot(cor_mat,
         method = "color",
         tl.cex = 0.9,
         number.cex = 0.95,
         addCoef.col = "black")

None of these values seem to be particularly large, so it seems we can safely assume that there is no multicollinearity present in these predictors.

Model Selection

# Modelling frame: cleaned listings with the predictors chosen during
# exploration; drop_na() removes rows with any missing value
df <- data %>%
  filter(price < 1500,
         bedrooms < 9,
         number_of_reviews > 5,
         last_review >= "2020-01-01") %>%
  select(id, longitude, latitude, price, room_type, minimum_nights,
         number_of_reviews, last_review, reviews_per_month, availability_365,
         bedrooms, calculated_host_listings_count, number_of_reviews_ltm) %>%
  drop_na()

We would like to find a model that outperforms linear regression trained using all predictors. This is the simplest model we could employ, though we found no evidence that the predictors have linear relationships with price it will serve as a good baseline to assess the performance of other models which are more computationally expensive to train.

# Reproducible 70/30 train/test split, keyed on listing id
set.seed(1)

train_data <- df %>% sample_frac(0.70)

# Everything not sampled into the training set becomes the test set
test_data <- df %>% anti_join(train_data, by = "id")

# Train model: baseline linear regression on every predictor except the id key
mod.lm <- lm(price ~ . - id, data = train_data)

# Test model on the held-out set
predictions <- predict(mod.lm, test_data, type = "response")

# Calculating RMSE and MAE from the test-set residuals
errors <- test_data$price - predictions
mae <- mean(abs(errors))
cat("Mean Absolute Error (MAE):", mae, "\n")
## Mean Absolute Error (MAE): 71.10984
rmse <- sqrt(mean(errors^2))
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 117.7678

Now we may want to test some other model techniques, namely ones that contain some sort of feature selection. To do that we will use random forest and lasso regression. Ideally these models will produce lower error rates than our standard linear model, as they should capture some of the non-linear relationships we observed in the data exploration phase.

set.seed(1)

# Split data train/test (same seed, so this reproduces the earlier split)

train_data <- df %>% sample_frac(0.70)

test_data <- df %>% anti_join(train_data, by = "id")

# 5-fold cross-validated recursive feature elimination using random forests
control_rfe <- rfeControl(functions = rfFuncs,
                          method = "cv",
                          number = 5,
                          verbose = FALSE)

# Candidate predictors: every column except the response
predictors <- setdiff(names(train_data), "price")

# Run RFE over subset sizes 1..10 (the full predictor set is also evaluated)
rfe_result <- rfe(train_data[, predictors],
                  train_data$price,
                  sizes = 1:10,
                  rfeControl = control_rfe)
print(rfe_result)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (5 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          1 108.90   0.3315 73.98  3.025    0.03220 1.076         
##          2 106.47   0.3715 69.76  3.854    0.02159 1.750         
##          3 104.21   0.4009 66.81  4.457    0.03229 2.554         
##          4 100.29   0.4530 62.84  4.656    0.03683 2.166         
##          5  99.41   0.4657 61.61  4.582    0.03300 2.300         
##          6  95.23   0.4881 57.10  2.969    0.01019 1.194         
##          7  94.61   0.4958 56.77  4.019    0.02127 2.033         
##          8  92.77   0.5179 55.29  4.085    0.02067 1.900         
##          9  91.93   0.5242 54.83  3.686    0.01954 1.381         
##         10  91.76   0.5273 54.65  3.462    0.02065 1.394         
##         12  91.04   0.5354 53.91  3.676    0.02047 1.941        *
## 
## The top 5 variables (out of 12):
##    bedrooms, room_type, minimum_nights, latitude, number_of_reviews_ltm

The top 5 of 12 predictors are bedrooms, room_type, minimum_nights, latitude and number_of_reviews_ltm.

set.seed(1)

# Bug fix: the original indexed the character vector `predictors` by the
# character vector rfe_result$optVariables — character subscripts only match
# names, so that produced all NAs. The selected feature names ARE
# rfe_result$optVariables; use them directly (and actually use the variable).
best_features <- rfe_result$optVariables

# Train the final model with the selected features
final_model <- randomForest(price ~ ., data = train_data[, c("price", best_features)])

# Evaluate the model performance on the test dataset
predictions <- predict(final_model, newdata = test_data[, best_features])
performance <- postResample(predictions, test_data$price)
print(performance)
##        RMSE    Rsquared         MAE 
## 104.4002474   0.5593081  58.2315611

As expected, random forest performs better than the standard linear regression. Now we build a lasso regression model.

set.seed(1)

# Creating model matrices of predictors for the glmnet functions
# (the -1 drops the intercept column; glmnet fits its own intercept)
X <- model.matrix(price ~ . - 1, data = train_data)
X.test <- model.matrix(price ~ . - 1, data = test_data)
Y <- train_data$price
Y.test <- test_data$price

# Fit LASSO with 5-fold cross-validation to choose the penalty
lasso.cv <- cv.glmnet(X, Y, alpha = 1, nfolds = 5)
# Lambda minimizing cross-validated error
optimal <- lasso.cv$lambda.min
# Refit at the optimal lambda only
lasso.mod <- glmnet(X, Y, alpha = 1, lambda = optimal)

# Make predictions on the test matrix. Bug fix: `optimal` was previously
# passed positionally and silently matched predict.glmnet's `s` (penalty)
# argument; naming both arguments makes the call unambiguous.
lasso.pred <- predict(lasso.mod, newx = X.test, s = optimal)

# Compute error
mae <- mean(abs(lasso.pred - Y.test))
cat("Mean Absolute Error (MAE):", mae, "\n")
## Mean Absolute Error (MAE): 70.63944
rmse <- sqrt(mean((lasso.pred - Y.test)^2))
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 117.3701

In this case we find the lasso not to be a significant improvement over the linear regression model. This is most likely caused by lasso not capturing non linear relationships which are captured in random forest.

The next method we will assess is gradient boosting machines

set.seed(1)

# Gradient boosting machine; last_review (a date) is excluded from the
# predictors, everything else from the training frame is used
gbm_model <- gbm(
  price ~ . - last_review,
  data = train_data,
  distribution = "gaussian",
  n.trees = 1000,
  interaction.depth = 3,
  shrinkage = 0.01,
  n.minobsinnode = 10,
  cv.folds = 5,
  verbose = FALSE
)

# Determine the tree count that minimizes cross-validated error
optimal_trees <- gbm.perf(gbm_model, method = "cv")

# Test-set error at the chosen number of trees
predictions <- predict(gbm_model, newdata = test_data, n.trees = optimal_trees)
performance <- postResample(predictions, test_data$price)
print(performance)
##        RMSE    Rsquared         MAE 
## 105.0167686   0.5511735  59.4993410

Gradient boosting actually produces slightly worse error rates than the random forest method (RMSE 105.0 vs 104.4). Next we will build a support vector machine model; to do this we will normalize the features so they are all on the same scale.

set.seed(1)

# Center/scale the predictors; the scaling parameters are fitted on the
# training data only, then applied to both splits, to avoid leakage
feature_cols <- setdiff(names(train_data), "price")
preprocess_params <- preProcess(train_data[, feature_cols],
                                method = c("center", "scale"))

train_data_scaled <- predict(preprocess_params, train_data)
test_data_scaled <- predict(preprocess_params, test_data)
# Ensure the response column stays on its original (unscaled) scale
test_data_scaled$price <- test_data$price

# Radial-kernel support vector regression via svm()
svm_model <- svm(price ~ .,
                 data = train_data_scaled,
                 kernel = "radial",
                 cost = 1,
                 gamma = 0.1,
                 epsilon = 0.1)

# Test-set error rates
predictions <- predict(svm_model, newdata = test_data_scaled)
performance <- postResample(predictions, test_data_scaled$price)
print(performance)
##        RMSE    Rsquared         MAE 
## 110.1921828   0.5280257  58.3953594

The SVM does not achieve the lowest RMSE of our models: its RMSE of 110.2 is higher than the random forest’s 104.4 and the gradient boosting model’s 105.0, though its MAE of 58.4 is close to the random forest’s 58.2.

library(ggplot2)

# Predicted vs. true prices for the SVM; the dashed red identity line
# marks where perfect predictions would fall
pred_vs_true <- data.frame(Predicted = predictions, True = test_data_scaled$price)
pred_vs_true %>%
  ggplot(aes(x = True, y = Predicted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(
    title = "SVM Model: Predicted vs. True Prices",
    x = "True Prices",
    y = "Predicted Prices"
  ) +
  theme_minimal()

SVM shows roughly an 18% improvement in mean absolute error over our baseline linear regression, and about a 6% improvement in root mean squared error.